Completed
Push — master ( 090c72...ff33da )
by Dylan
04:20 queued 01:54
created

crawler.CRAWL_FINISHED   C

Complexity

Conditions 7
Paths 6

Size

Total Lines 23

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 1 Features 0
Metric Value
dl 0
loc 23
rs 6.7272
c 1
b 1
f 0
cc 7
nc 6
nop 0
1
const default_tests = {

    /**
     * Definitions of the default tests, one entry per test:
     * [name, title, headers, type]
     *
     * `type` is optional; the registration code falls back to 'default'
     * when it is missing or empty.
     */
    tests: [
        ['bad_links', 'BAD LINKS', ['URL', 'Linked From'], ''],
        ['h1_info', 'H1 INFO', ['URL', 'Count', 'Text', 'Status']],
        ['h2_info', 'H2 INFO', ['URL', 'Count', 'Text', 'Status']],
        ['word_count', 'WORD COUNT', ['URL', 'Word Count', 'Article Word Count'], 'info'],
        ['int_link_info', 'INTERNAL LINKS',
            ['URL', 'Article Links', 'Article Link Count', 'Article Density', 'Total Link Count', 'Total Density', 'Status'],
            'info'],
        ['ext_link_info', 'EXTERNAL LINKS', ['URL', 'External Link Count', 'External Links'], 'success'],
        ['img_info', 'IMAGES', ['URL', 'Count', 'Missing Alt Tag', 'Missing Title Tag', 'Fields Missing Images', 'Status'], 'success'],
        ['title_info', 'META TITLE', ['URL', 'Meta Title', 'Length', 'Status']],
        ['description_info', 'META DESCRIPTION', ['URL', 'Meta Description', 'Length', 'Status']],
        ['canonical_info', 'CANONICAL', ['URL', 'Status'], 'success'],
        ['noindex_pages', 'NO-INDEX PAGES', ['URL'], 'success'],
        ['urls_test', 'URL STRUCTURE', ['URL', 'Status'], 'success'],
        ['duplicate_meta_tags', 'DUPLICATE META TAGS', ['URL', 'Status']],
        ['href_langs', 'LANG TAGS', ['URL', 'Tags'], 'info'],
        ['orphan_pages', 'ORPHAN PAGES', ['URL']],
        ['redirect_links', 'REDIRECT LINKS', ['Link', 'In', 'Redirects To']],
        ['social_tests', 'SOCIAL', ['URL', 'Status']]
    ],

    /**
     * Test the h1s on the page provided.
     * Exactly one h1 is 'success'; none or several is an 'error'.
     * The row is added under `this.name`
     * (NOTE(review): presumably set by the crawler when it invokes the test - confirm).
     *
     * @param {string} url
     * @param {jQuery} html
     * @returns {*} whatever crawler.painter.add_row returns
     */
    h1_info: function(url, html){
        const h1     = html.find('h1');
        const link   = crawler.painter.create_link(url, url);
        const joined = [];
        let status;

        h1.each(function(){ joined.push(this.innerHTML); });

        if (h1.length !== 1) {
            status = crawler.painter.create_status('error', (h1.length < 1) ? 'Missing H1' : 'Multiple H1 tags');
        } else {
            status = crawler.painter.create_status('success', 'OK!');
        }

        return crawler.painter.add_row(this.name, [link, h1.length, joined.join(', '), status]);
    },

    /**
     * Test the h2s on the page provided.
     * A page without any h2 gets a 'warning', anything else is 'success'.
     *
     * @param {string} url
     * @param {jQuery} html
     * @returns {*} whatever crawler.painter.add_row returns
     */
    h2_info: function(url, html){
        const h2     = html.find('h2');
        const link   = crawler.painter.create_link(url, url);
        const joined = [];
        let status;

        h2.each(function(){ joined.push(this.innerHTML); });

        if (h2.length < 1) {
            status = crawler.painter.create_status('warning', 'Missing H2');
        } else {
            status = crawler.painter.create_status('success', 'OK!');
        }

        return crawler.painter.add_row(this.name, [link, h2.length, joined.join(', '), status]);
    },

    /**
     * Check the word count for the passed page: one count for `phrases`
     * and one for `field_data[3]` (reported as the article word count).
     *
     * @param {string} url
     * @param {jQuery} html
     * @param {string} headers
     * @param {Array} field_data
     * @param {Array} phrases
     * @returns {*} whatever crawler.painter.add_row returns
     */
    word_count: function(url, html, headers, field_data, phrases){
        const link       = crawler.painter.create_link(url, url);
        const word_count = crawler.get_word_count(phrases);
        const art_count  = crawler.get_word_count(field_data[3]);

        return crawler.painter.add_row(this.name, [link, word_count, art_count]);
    },

    /**
     * Test the internal links found on the page provided.
     *
     * Collects internal, non-anchor links from the fields in `field_data[2]`
     * (article links) and from the whole page, then reports words-per-link
     * densities. A row is only added when the page has at least one internal link.
     *
     * @param {string} url
     * @param {jQuery} html
     * @param {string} headers
     * @param {Array} field_data
     * @param {Array} phrases
     * @returns {undefined}
     */
    int_link_info: function(url, html, headers, field_data, phrases){
        const link      = crawler.painter.create_link(url, url);
        const art_links = [];
        const links     = [];

        // Pushes every internal, non-anchor href found under `scope` onto `target`
        const collect = function(scope, target){
            $.each(scope.find('a'), function(){
                const href = $(this).attr('href');
                if (href && !crawler.is_external(href) && !crawler.is_anchor(href, url)) target.push(href);
            });
        };

        // Article links
        for (const field in field_data[2]) {
            collect($(field_data[2][field]), art_links);
        }

        // Full page links
        collect(html, links);

        // `false` marks "no links at all". A density of 0 is a real value, so every
        // check below uses strict comparison (a loose `!= false` would treat 0 as "no links").
        const art_word_count = crawler.get_word_count(field_data[3]);
        const art_density    = (art_links.length > 0) ? art_word_count / art_links.length : false;
        const art_dens_text  = (art_density !== false) ? art_density.toFixed(2) + ' words/link' : 'No internal links';
        const word_count     = crawler.get_word_count(phrases);
        const density        = (links.length > 0) ? word_count / links.length : false;
        const dens_text      = (density !== false) ? density.toFixed(2) + ' words/link' : 'No internal links';
        let status           = crawler.painter.create_status('success', 'OK!');

        // Fewer than 100 article words per article link is flagged
        if (art_density !== false && art_density < 100) {
            status = crawler.painter.create_status('warning', 'This page might be considered spammy');
        }

        if (links.length > 0) {
            crawler.painter.add_row(this.name, [
                link, art_links.join('<br />'), art_links.length, art_dens_text, links.length, dens_text, status
            ]);
        }

        return undefined;
    },

    /**
     * Test the external links on the page provided.
     *
     * Only links inside the fields of `field_data[2]` are inspected. Links whose
     * `rel` does not contain "nofollow" are shown as 'warning', the rest as 'info'.
     * A row is only added when at least one external link was found.
     *
     * @param {string} url
     * @param {jQuery} html
     * @param {string} headers
     * @param {Array} field_data
     * @returns {undefined}
     */
    ext_link_info: function(url, html, headers, field_data){
        const link  = crawler.painter.create_link(url, url);
        const links = [];

        for (const field in field_data[2]) {
            $.each($(field_data[2][field]).find('a'), function(){
                const anchor = $(this);
                const href   = anchor.attr('href');

                if (!href || !crawler.is_external(href)) return;

                const rel      = anchor.attr('rel');
                const followed = !rel || rel.toLowerCase().indexOf('nofollow') < 0;

                links.push(
                    $('<div class="clearfix"></div>').append([
                        crawler.painter.create_status(followed ? 'warning' : 'info', href),
                        '<p>&nbsp;</p>'
                    ])
                );
            });
        }

        if (links.length > 0) {
            crawler.painter.add_row(this.name, [link, links.length, links]);
        }

        return undefined;
    },

    /**
     * Test the images on the page provided.
     *
     * Counts images missing an alt or a title attribute and lists the fields
     * of `field_data[2]` that contain no image (named via `field_data[1]`).
     * Status priority: missing alt (error), fields without images (warning),
     * missing title (info), otherwise success.
     *
     * @param {string} url
     * @param {jQuery} html
     * @param {string} headers
     * @param {Array} field_data
     * @returns {*} whatever crawler.painter.add_row returns
     */
    img_info: function(url, html, headers, field_data) {
        const link   = crawler.painter.create_link(url, url);
        const imgs   = html.find('img');
        const fields = [];
        let alt      = 0;
        let title    = 0;
        let status;

        // Check alt and title tags (a missing and an empty attribute count the same)
        $.each(imgs, function(){
            const img = $(this);
            if (!img.attr('alt')) alt += 1;
            if (!img.attr('title')) title += 1;
        });

        // Check the fields
        for (const f in field_data[2]) {
            if ($(field_data[2][f]).find('img').length < 1) fields.push(field_data[1][f]);
        }

        // Construct Result
        if (alt > 0) {
            status = crawler.painter.create_status('error',
                (alt > 1) ? alt + ' images missing alt tag' : '1 image missing alt tag');
        } else if (fields.length > 0) {
            status = crawler.painter.create_status('warning',
                (fields.length > 1) ? fields.join(' and ') + ' are missing images' : fields[0] + ' is missing images');
        } else if (title > 0) {
            status = crawler.painter.create_status('info',
                (title > 1) ? title + ' images missing title tag' : '1 image is missing title tag');
        } else {
            status = crawler.painter.create_status('success', 'OK!');
        }

        return crawler.painter.add_row(this.name, [link, imgs.length, alt, title, fields.join(', '), status]);
    },

    /**
     * Test the meta title of the page provided (expected length 40-56 characters).
     * When exactly one title exists it is also stored in the crawler's
     * 'meta_titles' property.
     *
     * @param {string} url
     * @param {jQuery} html
     * @returns {*} whatever crawler.painter.add_row returns
     */
    title_info: function(url, html){
        const title  = html.filter('title');
        const link   = crawler.painter.create_link(url, url);
        const text   = (title.length === 1) ? title.html() : '';
        const status = default_tests.get_meta_tags_status(title, 'meta title', text, 40, 56);

        if (title.length === 1) {
            crawler.set_property('meta_titles', text, url);
        }

        return crawler.painter.add_row(this.name, [link, text, text.length, status]);
    },

    /**
     * Test the meta description for the page provided (expected length 70-156 characters).
     * When exactly one description exists it is also stored in the crawler's
     * 'descriptions' property.
     *
     * @param {string} url
     * @param {jQuery} html
     * @returns {*} whatever crawler.painter.add_row returns
     */
    description_info: function(url, html){
        const desc   = html.filter('meta[name=description]');
        const link   = crawler.painter.create_link(url, url);
        // .attr() yields undefined when the tag has no content attribute;
        // fall back to '' so the length checks below cannot throw.
        const text   = (desc.length === 1) ? (desc.attr('content') || '') : '';
        const status = default_tests.get_meta_tags_status(desc, 'meta description', text, 70, 156);

        if (desc.length === 1) {
            crawler.set_property('descriptions', text, url);
        }

        return crawler.painter.add_row(this.name, [link, text, text.length, status]);
    },

    /**
     * Test the canonical rules for the page provided.
     * Exactly one canonical link is stored in the crawler's 'canonicals'
     * property; zero or several produce an error row.
     *
     * @param {string} url
     * @param {jQuery} html
     * @returns {undefined}
     */
    canonical_info: function(url, html){
        const tags = default_tests.get_tags(html, 'link', 'rel', 'canonical');

        if (tags.length !== 1) {
            const status = crawler.painter.create_status('error', 'Missing / Multiple canonicals found');
            crawler.painter.add_row(this.name, [crawler.painter.create_link(url, url), status]);
        } else {
            crawler.set_property('canonicals', tags[0].attr('href'), url);
        }

        return undefined;
    },

    /**
     * Check if the page provided carries a no-index meta tag.
     *
     * A meta tag counts when its content is exactly 'noindex', or when it is a
     * robots/googlebot meta tag whose comma separated directive list contains
     * 'noindex' in any letter case (e.g. "noindex, nofollow").
     *
     * @param {string} url
     * @param {jQuery} html
     * @returns {undefined}
     */
    noindex_pages: function(url, html) {
        let found = false;

        $.each(html.filter('meta'), function(){
            const meta    = $(this);
            const content = meta.attr('content');
            if (!content) return;

            if (content === 'noindex') {
                found = true;
                return;
            }

            const name = (meta.attr('name') || '').toLowerCase();
            if (name !== 'robots' && name !== 'googlebot') return;

            const directives = content.toLowerCase().split(',').map(function(directive){
                return directive.trim();
            });
            if (directives.indexOf('noindex') >= 0) found = true;
        });

        if (found) {
            crawler.painter.add_row(this.name, [crawler.painter.create_link(url, url)]);
            crawler.painter.set_type(this.name, 'error');
        }

        return undefined;
    },

    /**
     * Test the url passed for its structure.
     * Only the first failing rule is reported: too long (> 115 characters),
     * not lower case, or containing an underscore.
     *
     * @param {string} url
     * @returns {*} undefined when the url is fine, else whatever crawler.painter.add_row returns
     */
    urls_test: function(url){
        const link = crawler.painter.create_link(url, url);
        let msg;

        if (url.length > 115)                 msg = 'URL is too long';
        else if (url.toLowerCase() !== url)   msg = 'URL is not in lower case';
        else if (url.indexOf('_') >= 0)       msg = 'URL contains under scores';
        else return undefined;

        return crawler.painter.add_row(this.name, [link, crawler.painter.create_status('warning', msg)]);
    },

    /**
     * Check for href lang tags in the page provided.
     * Each link tag with an hreflang attribute is listed as escaped markup;
     * a row is only added when at least one was found.
     *
     * @param {string} url
     * @param {jQuery} html
     * @returns {undefined}
     */
    href_langs: function(url, html){
        const link = crawler.painter.create_link(url, url);
        const tags = [];

        $.each(html.filter('link'), function(){
            const tag = $(this);
            if (!tag.attr('hreflang')) return;

            // Outer HTML of the tag, then escaped by round-tripping through .text()
            const markup = tag.clone().wrap('<p>').parent().html();
            tags.push($('<p>').text(markup).html());
        });

        if (tags.length > 0) crawler.painter.add_row(this.name, [link, tags.join('<br />')]);

        return undefined;
    },

    /**
     * Test for social markup on the page provided.
     * Every Open Graph tag, Twitter card tag and the publisher link must be
     * present exactly once; each violation becomes one line of the row and
     * the test type is set to 'warning'.
     *
     * @param {string} url
     * @param {jQuery} html
     * @returns {undefined}
     */
    social_tests: function(url, html){
        const errors  = [];
        const link    = crawler.painter.create_link(url, url);
        const fb_tags = {
            'og:title': 'Open Graph Title (og:title)',
            'og:type': 'Open Graph Type (og:type)',
            'og:image': 'Open Graph Image (og:image)',
            'og:url': 'Open Graph URL (og:url)',
            'og:description': 'Open Graph Description (og:description)',
            'og:locale': 'Open Graph Language (og:locale)',
            'og:site_name': 'Open Graph Site Name (og:site_name)'
        };
        const twitter_cards = {
            'twitter:card': 'Twitter Card Type (twitter:card)',
            'twitter:site': 'Twitter Site @username (twitter:site)'
        };

        // OG Tags
        for (const property in fb_tags) {
            if (default_tests.get_tags(html, 'meta', 'property', property).length !== 1) {
                errors.push("Missing/Multiple " + fb_tags[property]);
            }
        }

        // Twitter Cards
        for (const card in twitter_cards) {
            if (default_tests.get_tags(html, 'meta', 'name', card).length !== 1) {
                errors.push("Missing/Multiple " + twitter_cards[card]);
            }
        }

        // Google Publisher
        if (default_tests.get_tags(html, 'link', 'rel', 'publisher').length !== 1) {
            errors.push('Missing / Multiple publisher tag');
        }

        if (errors.length > 0) {
            const errs = errors.map(function(message){
                return $('<p class="row clearfix"></p>').append(message);
            });
            crawler.painter.add_row(this.name, [link, errs]);
            crawler.painter.set_type(this.name, 'warning');
        }

        return undefined;
    },

    /**
     * Returns a list of jQuery Objects that are of type {tag},
     * have an attribute {key} and its value is {value}.
     * Only elements matched by html.filter(tag) are considered.
     *
     * @param {jQuery} html
     * @param {string} tag
     * @param {string} key
     * @param {string} value
     * @returns {Array}
     */
    get_tags: function(html, tag, key, value){
        const returns = [];

        $.each(html.filter(tag), function(){
            const element = $(this);
            const current = element.attr(key);
            if (current && current === value) {
                returns.push(element);
            }
        });

        return returns;
    },

    /**
     * Goes through an object and returns the first key whose value
     * contains `search` (checked with the value's indexOf).
     *
     * @param {*} object
     * @param {*} search
     * @returns {*} the matching key, or undefined when nothing matches
     */
    get_key_from_object: function(object, search){
        for (const key in object) {
            if (object[key].indexOf(search) >= 0) return key;
        }
        return undefined;
    },

    /**
     * Gets the status box for the meta tag being tested.
     * More than one tag or no tag is an 'error'; a text shorter than
     * `min_char` or longer than `max_char` is a 'warning'; otherwise 'success'.
     *
     * @param {Array} tags
     * @param {string} tag_name
     * @param {string} text
     * @param {int} min_char
     * @param {int} max_char
     * @returns {jQuery} whatever crawler.painter.create_status returns
     */
    get_meta_tags_status: function(tags, tag_name, text, min_char, max_char){
        if (tags.length > 1) {
            return crawler.painter.create_status('error', 'Multiple ' + tag_name + ' tags');
        }
        if (tags.length < 1) {
            return crawler.painter.create_status('error', 'Missing ' + tag_name + ' tag');
        }

        const len = text.length;
        if (len < min_char) {
            return crawler.painter.create_status('warning', tag_name + ' is too short');
        }
        if (len > max_char) {
            return crawler.painter.create_status('warning', tag_name + ' is too long');
        }
        return crawler.painter.create_status('success', 'OK!');
    },

    /**
     * Return a string of links if there is a list of linked_from for the given url,
     * else return false. The links are joined with '<br />'.
     *
     * @param {string} url
     * @returns {string|boolean}
     */
    get_linked_from_links: function(url){
        if (!crawler.linked_from.hasOwnProperty(url)) {
            return false;
        }

        const sources     = crawler.linked_from[url];
        const linked_from = [];
        for (const lf in sources) {
            linked_from.push(crawler.painter.create_link(sources[lf], sources[lf]));
        }
        return linked_from.join('<br />');
    }
};
463
464
// Register every default test with the crawler before it initialises.
// Tests without a same-named function on default_tests are registered with
// `false` as their handler; a missing or empty type becomes 'default'.
crawler.event_handler.on('BEFORE_INIT', function(){
    default_tests.tests.forEach(function(definition){
        const name    = definition[0];
        const handler = default_tests.hasOwnProperty(name) ? default_tests[name] : false;

        // NOTE(review): `regiser_test` is spelled as the crawler is called here - confirm against the crawler API
        crawler.regiser_test(name, definition[1], definition[2], handler);
        crawler.painter.set_type(name, definition[3] || 'default');
    });
});
474
475
// When crawler is done check for orphan pages:
// tested pages that did not fail, are not in the sitemap and are not linked
// from any page other than themselves.
crawler.event_handler.on('ALL_CRAWLS_FINISHED', function(){
    // True when at least one recorded source of `page` differs from `page` itself
    const is_linked_from_elsewhere = function(page){
        if (!crawler.linked_from.hasOwnProperty(page)) return false;

        const sources = crawler.linked_from[page];
        for (const key in sources) {
            if (sources[key] != page) return true;
        }
        return false;
    };

    crawler.painter.set_type('orphan_pages', 'success');

    for (const index in crawler.tested) {
        const page    = crawler.tested[index];
        const skipped = crawler.failed.indexOf(page) >= 0 || crawler.sitemap.indexOf(page) >= 0;

        if (skipped || is_linked_from_elsewhere(page)) continue;

        crawler.painter.add_row('orphan_pages', [crawler.painter.create_link(page, page)]);
        crawler.painter.set_type('orphan_pages', 'error');
    }

    return true;
});
497
498
// When crawler is done check for duplicate meta tags:
// for every meta title / description shared by several urls, report the
// group when its urls do not all resolve to the same canonical.
crawler.event_handler.on('ALL_CRAWLS_FINISHED', function(){
    crawler.painter.set_type('duplicate_meta_tags', 'success');

    const canonicals = crawler.canonicals;
    const tests      = {
        'meta_titles'   : 'Urls have same meta title but different canonicals',
        'descriptions'  : 'Urls have same meta description but different canonicals'
    };

    for (const test in tests) {
        for (const value in crawler[test]) {
            const urls = crawler[test][value];

            // A value used by a single url cannot be a duplicate.
            // (Compare the list's length - comparing the array itself to 2 relies on coercion.)
            if (urls.length < 2) continue;

            const canonical = default_tests.get_key_from_object(canonicals, urls[0]);
            const differs   = urls.some(function(url){
                return default_tests.get_key_from_object(canonicals, url) !== canonical;
            });

            // One row per group is enough, however many urls disagree.
            // NOTE(review): the test type stays 'success' even when rows are added - confirm this is intended
            if (differs) {
                const status = crawler.painter.create_status('error', tests[test]);
                crawler.painter.add_row('duplicate_meta_tags', [urls.join(', '), status]);
            }
        }
    }

    return undefined;
});
524
525
// When crawler is done check for bad links:
// every failed url that has recorded referrers is listed with those referrers.
crawler.event_handler.on('ALL_CRAWLS_FINISHED', function(){
    crawler.painter.set_type('bad_links', 'success');

    for (const index in crawler.failed) {
        const failed_url = crawler.failed[index];
        const referrers  = default_tests.get_linked_from_links(failed_url);

        // get_linked_from_links returns false when nothing is recorded for the url
        if (referrers == false) continue;

        crawler.painter.add_row('bad_links', [failed_url, referrers]);
        crawler.painter.set_type('bad_links', 'error');
    }

    return undefined;
});
537
538
// When crawler is done check for redirect links:
// every redirecting url that has recorded referrers is listed with those
// referrers and the target it redirects to.
crawler.event_handler.on('ALL_CRAWLS_FINISHED', function(){
    crawler.painter.set_type('redirect_links', 'success');

    for (const source in crawler.redirects) {
        const referrers = default_tests.get_linked_from_links(source);

        // get_linked_from_links returns false when nothing is recorded for the url
        if (referrers == false) continue;

        crawler.painter.add_row('redirect_links', [source, referrers, crawler.redirects[source]]);
        crawler.painter.set_type('redirect_links', 'warning');
    }

    return undefined;
});
550